Load preprocessed dataset (preprocessing code in data_preprocessing.Rmd)


All samples together

plot_data = data.frame('ID'=rownames(datExpr), 'Mean'=rowMeans(datExpr))
p1 = ggplotly(plot_data %>% ggplot(aes(Mean)) + geom_density(color='#0099cc', fill='#0099cc', alpha=0.3) +
              scale_x_log10() + theme_minimal())

plot_data = data.frame('ID'=colnames(datExpr), 'Mean'=colMeans(datExpr))
p2 = ggplotly(plot_data %>% ggplot(aes(Mean)) + geom_density(color='#0099cc', fill='#0099cc', alpha=0.3) +
              theme_minimal() + ggtitle('Mean expression density by Probe (left) and by Sample (right)'))

subplot(p1, p2, nrows=1)
rm(p1, p2, plot_data)
plot_data = data.frame('ID'=rownames(datExpr), 'Mean'=rowMeans(datExpr)) %>% 
            left_join(GO_neuronal, by='ID') %>% mutate('Neuronal'=ifelse(is.na(Neuronal),F,T))
p1 = plot_data %>% ggplot(aes(Mean, color=Neuronal, fill=Neuronal)) + geom_density(alpha=0.3) +
                   scale_x_log10() + theme_minimal() + theme(legend.position='bottom') + 
                   ggtitle('Mean expression density by Probe')

plot_data = data.frame('ID'=colnames(datExpr), 'Mean'=colMeans(datExpr)) %>% 
            mutate('ID'=substring(ID,2)) %>% left_join(datMeta, by=c('ID'='Dissected_Sample_ID'))
p2 = plot_data %>% ggplot(aes(Mean, color=Diagnosis_, fill=Diagnosis_)) + geom_density(alpha=0.3) +
                   theme_minimal() + theme(legend.position='bottom') + 
                   ggtitle('Mean expression density by Sample')

grid.arrange(p1, p2, nrow=1)

rm(GO_annotations, GO_neuronal, plot_data, p1, p2)


ASD vs CTL

In general there doesn’t seem to be a lot of variance in mean expression between autism and control samples by probe. Biggest outliers are ENSG00000173110, ENSG00000234449 and ENSG00000143858.

plot_data = data.frame('ID'=rownames(datExpr),
                       'ASD'=rowMeans(datExpr[,datMeta$Diagnosis_=='ASD']),
                       'CTL'=rowMeans(datExpr[,datMeta$Diagnosis_!='ASD']))

plot_data %>% ggplot(aes(ASD,CTL)) + geom_point(alpha=0.1, color='#0099cc') + 
         geom_abline() + ggtitle('Mean expression ASD vs CTL') + theme_minimal()

plot_data = rbind(data.frame('Mean'=rowMeans(datExpr[,datMeta$Diagnosis_=='ASD']), 'Diagnosis'='ASD'),
                  data.frame('Mean'=rowMeans(datExpr[,datMeta$Diagnosis_!='ASD']), 'Diagnosis'='CTL'))
p1 = ggplotly(plot_data %>% ggplot(aes(Mean, color=Diagnosis, fill=Diagnosis)) + 
              geom_density(alpha=0.3) + scale_x_log10() + theme_minimal())

plot_data = rbind(data.frame('Mean'=colMeans(datExpr[,datMeta$Diagnosis_=='ASD']), 'Diagnosis'='ASD'),
                  data.frame('Mean'=colMeans(datExpr[,datMeta$Diagnosis_!='ASD']), 'Diagnosis'='CTL'))
p2 = ggplotly(plot_data %>% ggplot(aes(Mean, color=Diagnosis, fill=Diagnosis)) + 
              geom_density(alpha=0.3) + theme_minimal() +
              ggtitle('Mean expression by Probe (left) and by Sample (right) grouped by Diagnosis'))

subplot(p1, p2, nrows=1)
rm(p1, p2, plot_data)



Visualisations


Samples

PCA

Autism samples have a wider dispersion than controls

pca = datExpr %>% t %>% prcomp

plot_data = data.frame('ID'=colnames(datExpr), 'PC1' = pca$x[,1], 'PC2' = pca$x[,2]) %>% 
            mutate('ID'=substring(ID,2)) %>% left_join(datMeta, by=c('ID'='Dissected_Sample_ID')) %>% 
            dplyr::select('PC1','PC2','Diagnosis_') %>% 
            mutate('Diagnosis_'=factor(Diagnosis_, levels=c('ASD','CTL')))

plot_data %>% ggplot(aes(PC1, PC2, color=Diagnosis_)) + geom_point() + theme_minimal() + ggtitle('PCA') +
              xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1],1),'%)')) +
              ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2],1),'%)'))

rm(pca, plot_data)


MDS

Looks exactly the same as the PCA visualisation, just inverting the y-axis

d = datExpr %>% t %>% dist
fit = cmdscale(d, k=2)

plot_data = data.frame('ID'=colnames(datExpr), 'C1'=fit[,1], 'C2'=fit[,2]) %>%
            mutate('ID'=substring(ID,2)) %>% left_join(datMeta, by=c('ID'='Dissected_Sample_ID')) %>% 
            dplyr::select('C1','C2','Diagnosis_') %>%
            mutate('Diagnosis_'=factor(Diagnosis_, levels=c('ASD','CTL')))

plot_data %>% ggplot(aes(C1, C2, color=Diagnosis_)) + geom_point() + theme_minimal() + ggtitle('MDS')

rm(d, fit, plot_data)


t-SNE

Control samples seem to remain in the center of the distribution and the autism ones in the outline

perplexities = c(5,10,15,20)
ps= list()

for(i in 1:length(perplexities)){
  set.seed(123)
  tsne = datExpr %>% t %>% Rtsne(perplexity=perplexities[i])
  plot_data = data.frame('ID'=colnames(datExpr), 'C1'=tsne$Y[,1], 'C2'=tsne$Y[,2]) %>%
              mutate('ID'=substring(ID,2)) %>% left_join(datMeta, by=c('ID'='Dissected_Sample_ID')) %>%
              dplyr::select('C1','C2','Diagnosis_') %>%
              mutate('Diagnosis_'=factor(Diagnosis_, levels=c('ASD','CTL')))
  ps[[i]] = plot_data %>% ggplot(aes(C1, C2, color=Diagnosis_)) + geom_point() + theme_minimal() +
            ggtitle(paste0('Perplexity=',perplexities[i])) + theme(legend.position='none')
}

grid.arrange(grobs=ps, nrow=2)

rm(ps, plot_data, perplexities, tsne)


Genes

PCA

  • First Principal Component explains almost 97% of the total variance

  • There’s a really strong correlation between the mean expression of a probe and the 1st principal component

pca = datExpr %>% prcomp

plot_data = data.frame( 'PC1' = pca$x[,1], 'PC2' = pca$x[,2], 'MeanExpr'=rowMeans(datExpr))

plot_data %>% ggplot(aes(PC1, PC2, color=MeanExpr)) + geom_point(alpha=0.3) + theme_minimal() + 
              scale_color_viridis() + ggtitle('PCA') +
              xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1],1),'%)')) +
              ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2],1),'%)'))

rm(pca, plot_data)


MDS

Distance matrix is too heavy to calculate and the resulting distance object is to big to even load (3.4GB)

t-SNE

perplexities = c(1,2,5,10,50,100)
ps= list()

for(i in 1:length(perplexities)){
  tsne = read.csv(paste0('./../Data/Gandal/Visualisations/tsne_perplexity_',perplexities[i],'.csv'))
  plot_data = data.frame('C1'=tsne[,1], 'C2'=tsne[,2], 'MeanExpr'=rowMeans(datExpr))
  ps[[i]] = plot_data %>% ggplot(aes(C1, C2, color=MeanExpr)) + geom_point(alpha=0.2) + theme_minimal() +
            scale_color_viridis() + ggtitle(paste0('Perplexity=',perplexities[i])) + theme(legend.position='none')
}

grid.arrange(grobs=ps, nrow=2)

rm(ps, plot_data, perplexities, tsne, i)


Differential Expression Analysis

1055 probes (~4%) are significant using a threshold of 0.05 for the adjusted p-value and log2(1.2) log Fold Change

# It's ok to filter DE genes this way because the original filtering threhsold was log2(1.2), if not you would need to do:
# DE_info = results(dds, lfcThreshold=log2(1.2), altHypothesis='greaterAbs')
table(abs(DE_info$log2FoldChange)>log2(1.2), DE_info$padj<0.05)
##        
##         FALSE  TRUE
##   FALSE 19032     0
##   TRUE  10025  1055
DE_info$significant = abs(DE_info$log2FoldChange)>log2(1.2) & DE_info$padj<0.05
p = DE_info %>% ggplot(aes(log2FoldChange, padj, color=significant)) + geom_point(alpha=0.2) + 
    scale_y_sqrt() + xlab('log2 Fold Change') + ylab('Adjusted p-value') + theme_minimal()
ggExtra::ggMarginal(p, type = 'density', color='gray', fill='gray', size=10)

rm(p)
  • There seems to be a negative correlation between log Fold Change and the mean expression of the probes (there shouldn’t be any correlation)

  • There are more over than under expressed probes related to Autism

DE_info %>% ggplot(aes(baseMean, log2FoldChange, color=significant)) + geom_point(alpha=0.3) +
            geom_smooth(aes(baseMean, log2FoldChange), method='lm', inherit.aes=FALSE, color='gray') + 
            scale_x_log10() + theme_minimal()

  • When filtering for differential expression, the points seem to separate ino two clouds

  • The top cloud corresponds to the under expressed probes and the bottom to the under expressed ones

datExpr_DE = datExpr[DE_info$significant==TRUE,]

pca = datExpr_DE %>% prcomp

plot_data = cbind(data.frame('ID'=rownames(datExpr_DE), 'PC1'=pca$x[,1], 'PC2'=pca$x[,2]), 
                  DE_info[DE_info$significant==TRUE,])

pos_zero = -min(plot_data$log2FoldChange)/(max(plot_data$log2FoldChange)-min(plot_data$log2FoldChange))
p = plot_data %>% ggplot(aes(PC1, PC2, color=log2FoldChange)) + geom_point(alpha=0.5) +
                  scale_color_gradientn(colours=c('#F8766D','#faa49e','white','#00BFC4','#009499'), 
                                        values=c(0, pos_zero-0.05, pos_zero, pos_zero+0.05, 1)) +
                  theme_minimal() + ggtitle('PCA of significant genes') +
                  xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1],1),'%)')) +
                  ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2],1),'%)'))
ggExtra::ggMarginal(p, type='density', color='gray', fill='gray', size=10)

rm(pos_zero, p)

Separating the probes into these two groups: Salmon: under-expressed, aqua: over-expressed

intercept=4.5
slope=-0.01
plot_data = plot_data %>% mutate('Group'=ifelse(PC2>slope*PC1+intercept,'1','2'))
plot_data %>% ggplot(aes(PC1, PC2, color=Group)) + geom_point(alpha=0.3) + 
              xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1],1),'%)')) +
              ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2],1),'%)')) +
              geom_abline(intercept=intercept, slope=slope, color='gray') +
              theme_minimal() + ggtitle('PCA')

rm(intercept, slope, pca)

Plotting the mean expression by group they seem to have two underlying distributions, so a Gaussian Mixture Model was fitted to each one to separate them into two Gaussians and then the points corresponding to each one were plotted in the original PCA plot.

  • Under-expressed ASD probes tend to have a higher mean expressed than over-expressed ones (I would have thought this would be the opposite way)
gg_colour_hue = function(n) {
  hues = seq(15, 375, length = n + 1)
  hcl(h = hues, l = 65, c = 100)[1:n]
}

n_clusters = 2

plot_data = plot_data %>% mutate('MeanExpr'=rowMeans(datExpr_DE), 'SDExpr'=apply(datExpr_DE,1,sd))

GMM_G1 = plot_data %>% filter(Group=='1') %>% dplyr::select(MeanExpr) %>% GMM(n_clusters)
GMM_G2 = plot_data %>% filter(Group=='2') %>% dplyr::select(MeanExpr) %>% GMM(n_clusters)

memberships_G1 = data.frame('ID'=plot_data$ID[plot_data$Group=='1'],
                            'Membership'=GMM_G1$Log_likelihood %>% apply(1, function(x) glue('1_', which.max(x))))
memberships_G2 = data.frame('ID'=plot_data$ID[plot_data$Group=='2'],
                            'Membership'=GMM_G2$Log_likelihood %>% apply(1, function(x) glue('2_', which.max(x))))

plot_data = rbind(memberships_G1, memberships_G2) %>% left_join(plot_data, by='ID')

p1 = plot_data %>% ggplot(aes(x=MeanExpr, color=Group, fill=Group)) + geom_density(alpha=0.4) + theme_minimal() +     
     theme(legend.position='none')

p2 = plot_data %>% ggplot(aes(x=MeanExpr)) +
     stat_function(fun=dnorm, n=100, colour=gg_colour_hue(2*n_clusters)[1], # red
                   args=list(mean=GMM_G1$centroids[1], sd=GMM_G1$covariance_matrices[1])) +
     stat_function(fun=dnorm, n=100, colour=gg_colour_hue(2*n_clusters)[2], # green
                   args=list(mean=GMM_G1$centroids[2], sd=GMM_G1$covariance_matrices[2])) +
     stat_function(fun=dnorm, n=100, colour=gg_colour_hue(2*n_clusters)[3], # blue
                   args=list(mean=GMM_G2$centroids[1], sd=GMM_G2$covariance_matrices[1])) +
     stat_function(fun=dnorm, n=100, colour=gg_colour_hue(2*n_clusters)[4], # purple
                  args=list(mean=GMM_G2$centroids[2], sd=GMM_G2$covariance_matrices[2])) +
     theme_minimal()

p3 = plot_data %>% ggplot(aes(PC1, PC2, color=Membership)) + geom_point(alpha=0.3) + theme_minimal() + 
     theme(legend.position='bottom')

grid.arrange(p1, p2, p3, nrow=1)

rm(gg_color_hue, n_clusters, GMM_G1, GMM_G2, memberships_G1, memberships_G2, p1, p2)
## Warning in rm(gg_color_hue, n_clusters, GMM_G1, GMM_G2, memberships_G1, :
## object 'gg_color_hue' not found

For previous preprocessing pipelines, the pattern found above was also present in the standard deviation, but there doesn’t seem to be any distinguishible patterns now. This could be because the variance was almost homogenised with the vst normalisation algorithm.

plot_data %>% ggplot(aes(x=SDExpr, color=Group, fill=Group)) + geom_density(alpha=0.4) + theme_minimal()

rm(plot_data)



Effects of modifying the log fold change threshold

lfc_list = seq(0, 0.35, 0.02)

n_genes = nrow(datExpr)

# Calculate PCAs
datExpr_pca_samps = datExpr %>% data.frame %>% t %>% prcomp(scale.=TRUE)
datExpr_pca_genes = datExpr %>% data.frame %>% prcomp(scale.=TRUE)

# Initialice DF to save PCA outputs
pcas_samps = datExpr_pca_samps$x %>% data.frame %>% dplyr::select(PC1:PC2) %>% 
             mutate('ID'=colnames(datExpr), 'lfc'=-1, PC1=scale(PC1), PC2=scale(PC2))
pcas_genes = datExpr_pca_genes$x %>% data.frame %>% dplyr::select(PC1:PC2) %>% 
             mutate('ID'=rownames(datExpr), 'lfc'=-1, PC1=scale(PC1), PC2=scale(PC2))

pca_samps_old = pcas_samps
pca_genes_old = pcas_genes

for(lfc in lfc_list){
  
  # Recalculate DE_info with the new threshold (p-values change) an filter DE probes
  DE_genes = results(dds, lfcThreshold=lfc, altHypothesis='greaterAbs') %>% data.frame %>%
             mutate('ID'=rownames(.)) %>% filter(padj<0.05 & abs(log2FoldChange)>lfc)
  
  datExpr_DE = datExpr %>% data.frame %>% filter(rownames(.) %in% DE_genes$ID)
  n_genes = c(n_genes, nrow(DE_genes))
  
  # Calculate PCAs
  datExpr_pca_samps = datExpr_DE %>% t %>% prcomp(scale.=TRUE)
  datExpr_pca_genes = datExpr_DE %>% prcomp(scale.=TRUE)

  # Create new DF entries
  pca_samps_new = datExpr_pca_samps$x %>% data.frame %>% dplyr::select(PC1:PC2) %>% 
                  mutate('ID'=colnames(datExpr), 'lfc'=lfc, PC1=scale(PC1), PC2=scale(PC2))
  pca_genes_new = datExpr_pca_genes$x %>% data.frame %>% dplyr::select(PC1:PC2) %>% 
                  mutate('ID'=DE_genes$ID, 'lfc'=lfc, PC1=scale(PC1), PC2=scale(PC2))  
  
  # Change PC sign if necessary
  if(cor(pca_samps_new$PC1, pca_samps_old$PC1)<0) pca_samps_new$PC1 = -pca_samps_new$PC1
  if(cor(pca_samps_new$PC2, pca_samps_old$PC2)<0) pca_samps_new$PC2 = -pca_samps_new$PC2
  if(cor(pca_genes_new$PC1, pca_genes_old[pca_genes_old$ID %in% pca_genes_new$ID,]$PC1 )<0){
    pca_genes_new$PC1 = -pca_genes_new$PC1
  }
  if(cor(pca_genes_new$PC2, pca_genes_old[pca_genes_old$ID %in% pca_genes_new$ID,]$PC2 )<0){
    pca_genes_new$PC2 = -pca_genes_new$PC2
  }
  
  pca_samps_old = pca_samps_new
  pca_genes_old = pca_genes_new
  
  # Update DFs
  pcas_samps = rbind(pcas_samps, pca_samps_new)
  pcas_genes = rbind(pcas_genes, pca_genes_new)
  
}

# Add Diagnosis/SFARI score information
pcas_samps = pcas_samps %>% mutate('ID'=substring(ID,2)) %>% 
             left_join(datMeta, by=c('ID'='Dissected_Sample_ID')) %>%
             dplyr::select(ID, PC1, PC2, lfc, Diagnosis_, Brain_lobe)
# pcas_genes = pcas_genes %>% left_join(SFARI_genes, by='ID') %>% 
#              mutate('score'=as.factor(`gene-score`)) %>%
#              dplyr::select(ID, PC1, PC2, lfc, score)

# Plot change of number of genes
ggplotly(data.frame('lfc'=lfc_list, 'n_genes'=n_genes[-1]) %>% ggplot(aes(x=lfc, y=n_genes)) + 
         geom_point() + geom_line() + theme_minimal() + geom_vline(xintercept=log2(1.2), color='gray') +
         ggtitle('Number of remaining genes when modifying filtering threshold'))

lfc=-1 means no filtering at all, the rest of the filterings include on top of the defined lfc, an adjusted p-value lower than 0.05

Samples

Note: PC values get smaller as Log2 fold change increases, so on each iteration the values were scaled so it would be easier to compare between frames

Coloured by Diagnosis:

The higher the filtering threshold, the more concentrated the Control samples seem to get

ggplotly(pcas_samps %>% ggplot(aes(PC1, PC2, color=Diagnosis_)) + geom_point(aes(frame=lfc, ids=ID)) + 
         theme_minimal() + ggtitle('Samples PCA plot modifying filtering threshold'))


Coloured by brain region:

There doesn’t seem to be any recognisable pattern

ggplotly(pcas_samps %>% ggplot(aes(PC1, PC2, color=Brain_lobe)) + geom_point(aes(frame=lfc, ids=ID)) + 
         theme_minimal() + ggtitle('Samples PCA plot modifying filtering threshold'))


Genes

ggplotly(pcas_genes %>% ggplot(aes(PC1, PC2)) + geom_point(aes(frame=lfc, ids=ID, alpha=0.3)) + 
         theme_minimal() + ggtitle('Genes PCA plot modifying filtering threshold'))